{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 初始化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "import os\n",
    "import sys\n",
    "sys.path.append('../')  # 返回notebook的上一级目录\n",
    "# sys.path.append('E:\\GitHub\\QA-abstract-and-reasoning')  # 效果同上"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "no colab\n"
     ]
    }
   ],
   "source": [
    "# 在google colab运行则执行以下代码\n",
    "try:\n",
    "    from google.colab import drive\n",
    "    drive_path = '/content/drive'\n",
    "    working_path = drive_path + \"/My Drive/QA\" # 工作路径\n",
    "    drive.mount(drive_path)\n",
    "    os.chdir(working_path)\n",
    "    sys.path.append(working_path)  # 环境变量\n",
    "    print(\"current working directory: \", os.getcwd())\n",
    "    \n",
    "    # %tensorflow_version 仅存在于 Colab\n",
    "    %tensorflow_version 2.x\n",
    "    print(\"run notebook in colab\")\n",
    "except:\n",
    "    print(\"no colab\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2.0.0'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.ticker as ticker\n",
    "np.set_printoptions(suppress=True)\n",
    "from utils.plot import plot_attention\n",
    "from utils.saveLoader import *\n",
    "from utils.config import *\n",
    "from layers import *\n",
    "from preprocess import Preprocess\n",
    "from gensim.models.word2vec import LineSentence, Word2Vec\n",
    "import tensorflow as tf\n",
    "# from model_layer import seq2seq_model\n",
    "import time\n",
    "tf.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[限制gpu内存增长](https://tensorflow.google.cn/guide/gpu#limiting_gpu_memory_growth)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 Physical GPUs, 1 Logical GPUs\n"
     ]
    }
   ],
   "source": [
    "from utils.config_gpu import config_gpu\n",
    "config_gpu()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_x,train_y,test_x = load_train_dataset()  # 数据集\n",
    "vocab,vocab_reversed = load_vocab(VOCAB_PAD)  # vocab\n",
    "embedding_matrix = np.loadtxt(EMBEDDING_MATRIX_PAD)  # 预训练层"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 设置参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'vocab_size': 32247, 'max_enc_len': 460, 'max_dec_len': 52, 'embed_size': 300, 'enc_units': 256, 'attn_units': 10, 'dec_units': 256, 'batch_size': 32, 'epochs': 6}\n"
     ]
    }
   ],
   "source": [
    "params = {}\n",
    "params[\"vocab_size\"] = len(vocab)\n",
    "params[\"max_enc_len\"] = train_x.shape[1]  # 260\n",
    "params[\"max_dec_len\"] = train_y.shape[1]  # 33\n",
    "params[\"embed_size\"] = embedding_matrix.shape[1]\n",
    "params[\"enc_units\"] = 256\n",
    "params[\"attn_units\"] = 10\n",
    "params[\"dec_units\"] = params[\"enc_units\"]\n",
    "params[\"batch_size\"] = 32\n",
    "params[\"epochs\"] = 6\n",
    "print(params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构建训练集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 取部分数据进行训练\n",
    "sample_num=256\n",
    "#sample_num = train_x.shape[0]\n",
    "dataset = tf.data.Dataset.from_tensor_slices((train_x[:sample_num], train_y[:sample_num])).shuffle(params[\"batch_size\"]*2+1)\n",
    "dataset = dataset.batch(params[\"batch_size\"], drop_remainder=True)\n",
    "steps_per_epoch = sample_num//params[\"batch_size\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构建模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from seq2seq import *\n",
    "model=Seq2Seq(params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存点设置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from utils.config import CKPT_DIR, CKPT_PREFIX\n",
    "from utils.saveLoader import del_all_files_of_dir\n",
    "# 为了开始重新训练而不是继续上次的训练\n",
    "del_all_files_of_dir(CKPT_DIR)\n",
    "ckpt = tf.train.Checkpoint(Seq2Seq=model)\n",
    "ckpt_manager = tf.train.CheckpointManager(ckpt, CKPT_DIR, max_to_keep=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model restored\n"
     ]
    }
   ],
   "source": [
    "ckpt.restore(ckpt_manager.latest_checkpoint)\n",
    "print(\"Model restored\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[SparseCategoricalCrossentropy](https://tensorflow.google.cn/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = tf.keras.optimizers.Adam(name='Adam',learning_rate=0.001)\n",
    "loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')\n",
    "\n",
    "pad_index=vocab['<PAD>']\n",
    "unk_index=vocab['<UNK>']\n",
    "\n",
    "def loss_function(real, pred):\n",
    "    pad_mask = tf.math.equal(real, pad_index)\n",
    "    unk_mask = tf.math.equal(real, unk_index)\n",
    "    # <PAD> 和 <UNK> 的损失都不算\n",
    "    mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))\n",
    "    loss_ = loss_object(real, pred)\n",
    "\n",
    "    mask = tf.cast(mask, dtype=loss_.dtype)\n",
    "    loss_ *= mask\n",
    "    return tf.reduce_mean(loss_)\n",
    "    # return tf.reduce_sum(loss_)/tf.reduce_sum(mask)\n",
    "\n",
    "def coverage_loss_function(real, pred, attn_dists):\n",
    "    # 先计算原本的损失\n",
    "    pad_mask = tf.math.equal(real, pad_index)\n",
    "    unk_mask = tf.math.equal(real, unk_index)\n",
    "    # <PAD> 和 <UNK> 的损失都不算\n",
    "    mask = tf.math.logical_not(tf.math.logical_or(pad_mask,unk_mask))\n",
    "    loss_ = loss_object(real, pred)\n",
    "\n",
    "    mask = tf.cast(mask, dtype=loss_.dtype)\n",
    "    loss_ *= mask\n",
    "        \n",
    "    coverage = tf.zeros_like(attn_dists[0]) # shape (batch_size, attn_length). Initial coverage is zero.\n",
    "    #covlosses = [] # Coverage loss per decoder timestep. Will be list length max_dec_steps containing shape (batch_size).\n",
    "    covlosses = []\n",
    "    for i,a in enumerate(attn_dists):\n",
    "        covloss = tf.reduce_sum(tf.minimum(a, coverage), [1]) # calculate the coverage loss for this step\n",
    "        covlosses.append(covloss)\n",
    "        coverage += a # update the coverage vector\n",
    "    # coverage_loss = _mask_and_avg(covlosses)\n",
    "    \n",
    "    coverage_loss = 0\n",
    "    for i,_ in enumerate(covlosses):\n",
    "        covlosses[i] = covlosses[i] * tf.expand_dims(mask[:, i],1)\n",
    "        coverage_loss += tf.reduce_mean(covlosses[i])\n",
    "    coverage_loss /= len(covlosses)\n",
    "\n",
    "    # print(\"coverage loss\", (coverage_loss/ tf.reduce_sum(mask)).numpy())\n",
    "    return tf.reduce_mean(loss_) + 3*coverage_loss\n",
    "    #return (tf.reduce_sum(loss_)+coverage_loss) / tf.reduce_sum(mask)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(32245, 32246)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pad_index,unk_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 调试train_step()\n",
    "# inp, targ = next(iter(dataset))\n",
    "# pad_index=vocab['<PAD>']\n",
    "# unk_index=vocab['<UNK>']\n",
    "# enc_output, enc_hidden = model.call_encoder(inp)\n",
    "# dec_hidden = enc_hidden\n",
    "# dec_input = tf.expand_dims([vocab['<START>']] * params[\"batch_size\"], 1)\n",
    "# predictions, _ = model(dec_input, dec_hidden, enc_output, targ)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "@tf.function\n",
    "def train_step(inp, targ):\n",
    "    pad_index=vocab['<PAD>']\n",
    "    unk_index=vocab['<UNK>']\n",
    "    loss = 0\n",
    "    \n",
    "    with tf.GradientTape() as tape:\n",
    "        # 1. 构建encoder\n",
    "        enc_output, enc_hidden = model.call_encoder(inp)\n",
    "        # 2. 复制\n",
    "        dec_hidden = enc_hidden\n",
    "        # 3. <START> * BATCH_SIZE \n",
    "        dec_input = tf.expand_dims([vocab['<START>']] * params[\"batch_size\"], 1)\n",
    "        \n",
    "        # 逐个预测序列\n",
    "        predictions, _, attentions = model(dec_input, dec_hidden, enc_output, targ)\n",
    "        # print(predictions.shape)\n",
    "        # batch_loss = loss_function(targ[:, 1:], predictions)\n",
    "        # \n",
    "        batch_loss =  coverage_loss_function(targ[:, 1:], predictions, attentions)\n",
    "        \n",
    "        variables = model.encoder.trainable_variables + model.decoder.trainable_variables+ model.attention.trainable_variables\n",
    "    \n",
    "        gradients = tape.gradient(batch_loss, variables)\n",
    "\n",
    "        optimizer.apply_gradients(zip(gradients, variables))\n",
    "\n",
    "        return batch_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Latest checkpoint restored!!\n"
     ]
    }
   ],
   "source": [
    "epochs = params[\"epochs\"]\n",
    "# 如果检查点存在，则恢复最新的检查点。\n",
    "if ckpt_manager.latest_checkpoint:\n",
    "    ckpt.restore(ckpt_manager.latest_checkpoint)\n",
    "    print ('Latest checkpoint restored!!')\n",
    "    \n",
    "for epoch in range(epochs):\n",
    "    start = time.time()\n",
    "    total_loss = 0\n",
    "\n",
    "    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch+1)):\n",
    "        \n",
    "        batch_loss = train_step(inp, targ)\n",
    "        total_loss += batch_loss\n",
    "\n",
    "        if batch % 1 == 0:\n",
    "            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,\n",
    "                                                         batch,\n",
    "                                                         batch_loss.numpy()))\n",
    "    # saving (checkpoint) the model every 2 epochs\n",
    "    if (epoch + 1) % 1 == 0:\n",
    "        ckpt_save_path = ckpt_manager.save()\n",
    "        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,\n",
    "                                                             ckpt_save_path))\n",
    "\n",
    "    print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n",
    "                                      total_loss / steps_per_epoch))\n",
    "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 载入模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果检查点存在，则恢复最新的检查点。\n",
    "ckpt.restore(ckpt_manager.latest_checkpoint)\n",
    "print(\"Model restored\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate(model,inputs):\n",
    "    attention_plot = np.zeros((params[\"max_dec_len\"], params[\"max_enc_len\"]))\n",
    "    \n",
    "    inputs = tf.convert_to_tensor(inputs)\n",
    "\n",
    "    result = ''\n",
    "    \n",
    "    hidden = [tf.zeros((1, params[\"enc_units\"]))]\n",
    "    enc_output, enc_hidden = model.encoder(inputs, hidden)\n",
    "\n",
    "    dec_hidden = enc_hidden\n",
    "    \n",
    "    dec_input = tf.expand_dims([vocab['<START>']], 0)\n",
    "    \n",
    "    context_vector, _ = model.attention(dec_hidden, enc_output)\n",
    "\n",
    "    for t in range(params[\"max_dec_len\"]):\n",
    "        \n",
    "        context_vector, attention_weights = model.attention(dec_hidden, enc_output)\n",
    "        \n",
    "        predictions, dec_hidden = model.decoder(dec_input,\n",
    "                                         dec_hidden,\n",
    "                                         enc_output,\n",
    "                                         context_vector)\n",
    "\n",
    "        # storing the attention weights to plot later on\n",
    "        attention_weights = tf.reshape(attention_weights, (-1, ))\n",
    "        \n",
    "        attention_plot[t] = attention_weights.numpy()\n",
    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
    "\n",
    "        result += vocab_reversed[predicted_id] + ' '\n",
    "        if vocab_reversed[predicted_id] == '<STOP>':\n",
    "            return result, attention_plot\n",
    "\n",
    "        # the predicted ID is fed back into the model\n",
    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
    "\n",
    "    return result, attention_plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate(sentence):\n",
    "    st = preproc.sentence_proc(sentence)\n",
    "    sentence = preproc.sentence_proc_eval(sentence,params[\"max_enc_len\"]-2,vocab)\n",
    "    result, attention_plot = evaluate(model,sentence)\n",
    "\n",
    "    print('Input: %s' % (st))\n",
    "    print('Predicted translation: {}'.format(result))\n",
    "\n",
    "    attention_plot = attention_plot[:len(result.split(' ')), :len(st.split(' '))]\n",
    "    plot_attention(attention_plot, st.split(' '), result.split(' '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentence = '方向机重，助力泵，方向机都换了还是一样'\n",
    "preproc = Preprocess()\n",
    "preproc.sentence_proc(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import matplotlib as mpl\n",
    "# mpl.rcParams['font.family'] = 'STSong'  # 显示中文"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "t = \"检查 下 支臂 球头 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 下 支臂 ， 需要 检查 \"\n",
    "a = t.split(\" \")\n",
    "len(a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "translate(sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下半部分\n",
    "assert False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch_predict(inps):\n",
    "    # 判断输入长度\n",
    "    batch_size=len(inps)\n",
    "    # 开辟结果存储list\n",
    "    preidicts=[''] * batch_size\n",
    "    \n",
    "    inps = tf.convert_to_tensor(inps)\n",
    "    # 0. 初始化隐藏层输入\n",
    "    hidden = [tf.zeros((batch_size, params[\"enc_units\"]))]\n",
    "    # 1. 构建encoder\n",
    "    enc_output, enc_hidden = model.encoder(inps, hidden)\n",
    "    # 2. 复制\n",
    "    dec_hidden = enc_hidden\n",
    "    # 3. <START> * BATCH_SIZE \n",
    "    dec_input = tf.expand_dims([vocab['<START>']] * batch_size, 1)\n",
    "    \n",
    "    context_vector, _ = model.attention(dec_hidden, enc_output)\n",
    "    # Teacher forcing - feeding the target as the next input\n",
    "    for t in range(params[\"max_dec_len\"]):\n",
    "        # 计算上下文\n",
    "        context_vector, attention_weights = model.attention(dec_hidden, enc_output)\n",
    "        # 单步预测\n",
    "        predictions, dec_hidden = model.decoder(dec_input,\n",
    "                                         dec_hidden,\n",
    "                                         enc_output,\n",
    "                                         context_vector)\n",
    "        \n",
    "        # id转换 贪婪搜索\n",
    "        predicted_ids = tf.argmax(predictions,axis=1).numpy()\n",
    "        \n",
    "        \n",
    "        for index,predicted_id in enumerate(predicted_ids):\n",
    "            preidicts[index]+= vocab_reversed[predicted_id] + ' '\n",
    "        \n",
    "        # using teacher forcing\n",
    "        dec_input = tf.expand_dims(predicted_ids, 1)\n",
    "\n",
    "    results=[]\n",
    "    for preidict in preidicts:\n",
    "        # 去掉句子前后空格\n",
    "        preidict=preidict.strip()\n",
    "        # 句子小于max len就结束了 截断\n",
    "        if '<STOP>' in preidict:\n",
    "            # 截断stop\n",
    "            preidict=preidict[:preidict.index('<STOP>')]\n",
    "        # 保存结果\n",
    "        results.append(preidict)\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 测试代码\n",
    "# ds = iter(dataset)\n",
    "# x,y = ds.next()\n",
    "# batch_predict(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def model_predict(data_X, batch_size):\n",
    "    # 存储结果\n",
    "    results=[]\n",
    "    # 样本数量\n",
    "    sample_size=len(data_X)\n",
    "    # batch 操作轮数 math.ceil向上取整 小数 +1\n",
    "    # 因为最后一个batch可能不足一个batch size 大小 ,但是依然需要计算  \n",
    "    steps_epoch = math.ceil(sample_size/batch_size)\n",
    "    # [0,steps_epoch)\n",
    "    for i in tqdm(range(steps_epoch)):\n",
    "        batch_data = data_X[i*batch_size:(i+1)*batch_size]\n",
    "        results+=batch_predict(batch_data)\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "# 128 或 256\n",
    "results=model_predict(test_x[:500],batch_size=256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 读入提交数据\n",
    "test_df=pd.read_csv(TEST_DATA)\n",
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def submit_proc(sentence):\n",
    "    sentence=sentence.lstrip(' ，！。？-.')\n",
    "    sentence=sentence.replace(' ','')\n",
    "    if sentence=='':\n",
    "        sentence='随时联系'\n",
    "    return sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx,result in enumerate(results):\n",
    "    if result=='':print(idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 赋值结果\n",
    "test_df['Prediction']=results\n",
    "#　提取ID和预测结果两列\n",
    "test_df=test_df[['QID','Prediction']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df['Prediction']=test_df['Prediction'].apply(submit_proc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def del_repeat(sentence):\n",
    "    pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from utils.config import RESULT_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存结果.\n",
    "result_save_path = os.path.join(RESULT_PATH, \"my_first_result.csv\")\n",
    "test_df.to_csv(result_save_path, index=None,sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df2=pd.read_csv(result_save_path)\n",
    "# 查看格式\n",
    "test_df2.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tf2.0]",
   "language": "python",
   "name": "conda-env-tf2.0-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
